import pandas as pd
# Show full cell contents instead of truncating long review text.
pd.set_option('display.max_colwidth', None)
Text Processing & Labelling [Part2]
Tutorials / Implementations
NLP
Sample of labelling data for Sentiment Analysis task
The full notebook is available here.
Labelling
# Load the preprocessed reviews produced by the previous part of the tutorial.
df = pd.read_csv("preprocessed.csv")
You can download the dataset here.
# Keep only the rows whose `lang` column is "en".
filtered_df = df[df.lang == "en"]
Step 1: Segment Document into Sentences
We want to reduce the complexity that arises when a document contains multiple sentences with different polarities. Therefore, instead of making a prediction for a whole document, we make one for each sentence.
Note: This approach does not guarantee that a single sentence will never contain two conflicting polarities, but it reduces the probability of that situation.
def segment_review(df):
    """Split every review into sentences, each inheriting the review's rating.

    Args:
        df: DataFrame with a "review" column (text) and a "star_rating" column.

    Returns:
        A tuple ``(new_reviews, new_ratings)`` of two equally long lists: the
        sentence strings and, for each sentence, the star rating of the review
        it came from.

    Relies on a module-level ``nlp`` callable that returns an object exposing
    ``.sents`` (presumably a spaCy pipeline -- it is not defined in this block).
    """
    new_reviews = []
    new_ratings = []
    for review, rating in zip(df["review"], df["star_rating"]):
        doc = nlp(review)
        for sent in doc.sents:
            new_reviews.append(str(sent))
            # Every sentence keeps the document-level rating.
            new_ratings.append(rating)
    return new_reviews, new_ratings
# Build the frame to segment: lower-cased review text plus its star rating.
doc_df = pd.DataFrame({"review": filtered_df["lower_case"], "star_rating": filtered_df["star_rating"]})
# Check NaN
# NOTE(review): this inspects `df`, not the filtered `doc_df` -- confirm that is intended.
df["star_rating"].isnull().sum()
0
# Explode each review into sentences and collect them in a new frame.
new_texts, new_ratings = segment_review(doc_df)
segmented_df = pd.DataFrame({"review": new_texts, "star_rating": new_ratings})
print("Number of data after segmenting:", len(segmented_df))
Number of data after segmenting: 35448
"segmented_df.csv", index=False) segmented_df.to_csv(
Step 2: Add labels using rating and pretrained-models’ predictions
# Work on a deep copy so the label columns added below do not touch segmented_df.
label_df = segmented_df.copy(deep=True)
Rating Label
def rating2label(rating):
    """Map a numeric star rating to a sentiment label.

    Returns "Neutral" for exactly 3, "Negative" for anything below 3 and
    "Positive" for everything else.
    """
    if rating == 3:
        return "Neutral"
    elif rating < 3:
        return "Negative"
    else:
        return "Positive"
def score2label(score):
    """Map a binary classifier score to a label: 1 is "Positive", anything else "Negative"."""
    if score == 1:
        return "Positive"
    else:
        return "Negative"
# Add rating labels derived from the star rating of the parent review.
label_df["rating_label"] = label_df["star_rating"].apply(rating2label)
BERT Label
%%capture
!pip install transformers
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
def get_classifier(model_name, **kwargs):
    """Build a "text-classification" pipeline for a pretrained model.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        **kwargs: Only ``id2label`` is read. When given (and non-empty) it is
            forwarded to the model so predictions carry those label names.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    id2label = kwargs.get("id2label")
    if id2label:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, id2label=id2label)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    return classifier
Model 1: nlptown/bert-base-multilingual-uncased-sentiment
# Load pretrained model
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Fixed typo: the helper is named get_classifier (was get_classifer -> NameError).
classifier = get_classifier(model_name)
# Function to convert the model output to label
def get_prediction(text):
    """Classify `text` with the global `classifier` and return a sentiment label.

    The first whitespace-separated token of the top prediction's label is
    parsed as an integer rating and converted with `rating2label`.
    """
    rating = int(classifier(text)[0]['label'].split()[0])
    return rating2label(rating)
"nlptown_bert_label"] = label_df['review'].apply(get_prediction) label_df[
label_df.head()
"label_df.csv", index=False) label_df.to_csv(
'review'][0]) get_prediction(label_df[
# Check number of neutral
"nlptown_bert_label"] == "Neutral"][["review"]] label_df[label_df[
Model 2: cardiffnlp/twitter-roberta-base-sentiment
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

model_name = "cardiffnlp/twitter-roberta-base-sentiment"
# Supply readable label names for the model's three output ids.
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
classifier = get_classifier(model_name, id2label=id2label)
{'id2label': {0: 'Negative', 1: 'Neutral', 2: 'Positive'}}
def get_prediction(text):
    """Return the label of the top prediction of the global `classifier` for `text`."""
    return classifier(text)[0]['label']
"twitter-robert_label"] = label_df['review'].apply(get_prediction) label_df[
label_df.head()
review | star_rating | rating_label | nlptown_bert_label | twitter-robert_label | |
---|---|---|---|---|---|
0 | i love my new laptop! | 5 | Positive | Positive | Positive |
1 | best computer i have ever own! | 5 | Positive | Positive | Positive |
2 | this computer forces me to be productive. | 5 | Positive | Positive | Positive |
3 | i used to wait around for this spinning wheel to stop, | 5 | Positive | Negative | Neutral |
4 | and now i can do everything so quickly | 5 | Positive | Positive | Positive |
"review"] = label_df["review"].str.strip() label_df[
"label_df_full.csv", index=False) label_df.to_csv(
Model 3: LSTM (Optional)
# Label by LSTM model
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/basic_stanford_sentiment_treebank-2020.06.09.tar.gz")

# Iterate label_df (the original referenced an undefined `new_df`); the result
# is stored back into label_df, so it must have one entry per label_df row.
lstm_label = [
    score2label(int(predictor.predict(review)['label']))
    for review in label_df['review']
]
Plugin allennlp_models could not be loaded: No module named 'nltk.translate.meteor_score'
"LSTM_label"] = lstm_label label_df[
Analyze labels and conflicts
%%capture
!python -m pip install -U matplotlib # The code requires the latest version of matplotlib
# Load labeled dataset
label_df = pd.read_csv("./label_df_full.csv")
import seaborn as sns
def draw_bar_graph(data, title=None, x_label=None, y_label=None):
    """Draw a seaborn bar chart of `data` with the value printed on each bar.

    Args:
        data: Mapping of category name -> numeric value (one bar per key).
        title: Chart title.
        x_label: Label for the x axis.
        y_label: Label for the y axis.
    """
    sns.set_theme(style="white")
    y_values = [float(v) for v in data.values()]
    ax = sns.barplot(x=list(data.keys()), y=y_values)
    # Annotate each bar with its height.
    ax.bar_label(ax.containers[0])
    # Leave headroom above the tallest bar so its annotation is not clipped.
    ax.set_ylim([0, max(y_values) + 2000])
    ax.set_title(title, fontsize=16)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Rows where two labelling sources disagree, for each pair of sources.
conflict_rating_nlptown = label_df[~(label_df['rating_label'] == label_df['nlptown_bert_label'])]
conflict_nlptown_roberta = list(label_df[~(label_df['twitter-robert_label'] == label_df['nlptown_bert_label'])]['review'])
conflict_rating_roberta = list(label_df[~(label_df['twitter-robert_label'] == label_df['rating_label'])]['review'])
conflicts = {"rating-roberta": len(conflict_rating_roberta), "rating-nlptown": len(conflict_rating_nlptown), "nlptown-roberta": len(conflict_nlptown_roberta)}
draw_bar_graph(conflicts, title="Conflicts between labels", y_label="Number of samples")
# Only count the samples that contain alphabetic characters.
non_neutral = conflict_rating_nlptown[conflict_rating_nlptown['nlptown_bert_label'] != "Neutral"]
# After lower-casing, str.islower() is True only if at least one cased letter exists.
character_exist = non_neutral[non_neutral['review'].str.lower().str.islower()]
len(character_exist)
7344
# Only count the samples that nlptown labels as Neutral and on which
# nlptown_bert_label and twitter-robert_label disagree.
neutral = conflict_rating_nlptown[conflict_rating_nlptown['nlptown_bert_label'] == "Neutral"]
neutral_2_bert_conflict = neutral[neutral["nlptown_bert_label"] != neutral["twitter-robert_label"]]
len(neutral_2_bert_conflict)
2348
# Stack the two conflict subsets column by column (character_exist first).
review = list(character_exist["review"]) + list(neutral_2_bert_conflict["review"])
rating_label = list(character_exist["rating_label"]) + list(neutral_2_bert_conflict["rating_label"])
nlptown_bert_label = list(character_exist["nlptown_bert_label"]) + list(neutral_2_bert_conflict["nlptown_bert_label"])
twitter_robert_label = list(character_exist["twitter-robert_label"]) + list(neutral_2_bert_conflict["twitter-robert_label"])
# NOTE: the output column is spelled "twitter-bert_label" here, unlike the
# "twitter-robert_label" source column; kept as-is because saved files use it.
conflict = {"review": review, "rating_label": rating_label, "nlptown_bert_label": nlptown_bert_label, "twitter-bert_label": twitter_robert_label}
conflict_df = pd.DataFrame(conflict)
conflict_df.head()
review | rating_label | nlptown_bert_label | twitter-bert_label | |
---|---|---|---|---|
0 | i used to wait around for this spinning wheel to stop, | Positive | Negative | Neutral |
1 | ha ha. | Positive | Negative | Neutral |
2 | i do not have time to get coffee anymore. | Positive | Negative | Negative |
3 | i had an issue with my laptop. | Positive | Negative | Negative |
4 | the battery life drained fast | Positive | Negative | Negative |
"conflict_df.csv", index=False) conflict_df.to_csv(
print("Total numbers of samples we need to label is: ", len(conflict_df))
Total numbers of samples we need to label is: 9692
Label Rules
- Accept the label if `rating_label` and `nlptown_bert_label` both agree on the label. I don’t take `twitter-robert_label` into account in this situation since it is trained on tweets, not product reviews, which makes it less reliable than `nlptown_bert_label`. Because of limited human resources, it is not always possible to produce a perfect label for the dataset. Yet, the dataset can still be improved later using Active Learning.
- Accept the Neutral label if `nlptown_bert_label` and `twitter-robert_label` both agree on the label. The rationale for this is that when we do the sentence segmentation, some sentences from the positive/negative documents (labeled by star) have neutral sentiments, but still get labeled as positive or negative due to the `rating_label`.
- Accept samples whose review has alphabet characters. This rule will remove sample text such as “4.”, “!”, etc.
Distribution of labels in accepted data
# Accepted samples: rating-derived label and nlptown prediction agree.
accepted_df = label_df[label_df["rating_label"] == label_df["nlptown_bert_label"]]
num_neutral = len(accepted_df[accepted_df["rating_label"] == "Neutral"])
num_positive = len(accepted_df[accepted_df["rating_label"] == "Positive"])
num_negative = len(accepted_df[accepted_df["rating_label"] == "Negative"])
data = {"Negative": num_negative, "Neutral": num_neutral, "Positive": num_positive}
draw_bar_graph(data, title="Distribution of classes", x_label="Classes", y_label="Number of samples")
The dataset is unbalanced, and the class with the fewest samples is Neutral, which is what we should expect. The reason is that, since we segment each document into multiple sentences, some sentences have neutral polarity but still get labeled as positive or negative due to the `rating_label`.
Upload to Label Studio
def generate_import_json(text, labels):
    """Build one import task dict carrying `text` and one annotation per label.

    Args:
        text: The sample text, stored under ``data.text``.
        labels: Iterable of label strings; each becomes its own annotation
            with a single "choices" result.

    Returns:
        ``{"data": {"text": text}, "annotations": [...]}`` where every
        annotation targets ``to_name="text"`` / ``from_name="sentiment"``.
    """
    annotations = [
        {
            "result": [
                {
                    "type": "choices",
                    "value": {"choices": [label]},
                    "to_name": "text",
                    "from_name": "sentiment",
                }
            ]
        }
        for label in labels
    ]
    return {"data": {"text": text}, "annotations": annotations}
# NOTE(review): conflict_df is created with pd.DataFrame(...) from plain lists,
# so its index is 0..len-1; this drops the rows of `df` carrying those index
# labels, which may not be the conflicting samples -- confirm the intent.
labeled_df = df.drop(conflict_df.index)
labeled_df.to_csv("labeled_df.csv", index=False)
# Load the manually relabeled export and inspect the most recent annotations.
relabeled_df = pd.read_csv("relabeled_data.csv")
relabeled_df[relabeled_df['annotation_id'] > 60000]
text | id | sentiment | annotator | annotation_id | |
---|---|---|---|---|---|
0 | no es la mejor. | 521560 | Neutral | admin@slsops.gmail.com | 68106 |
4 | Ese es su único defecto, | 521559 | Neutral | admin@slsops.gmail.com | 68105 |
8 | 3 horas en trabajo medio, con varios aplicativ... | 521558 | Neutral | admin@slsops.gmail.com | 68104 |
12 | This machine is NOT a touchscreen. | 521465 | Negative | admin@slsops.gmail.com | 68011 |
16 | Also when I try to use my audio interface in L... | 521453 | Neutral | admin@slsops.gmail.com | 67999 |
... | ... | ... | ... | ... | ... |
672 | Like: size, weight, screen picture, battery us... | 521395 | Positive | admin@slsops.gmail.com | 67941 |
676 | Almost too perfect. | 521394 | Positive | admin@slsops.gmail.com | 67940 |
680 | I don’t have time to get coffee anymore. | 521393 | Negative | admin@slsops.gmail.com | 67939 |
684 | Ha ha. | 521392 | Positive | admin@slsops.gmail.com | 67937 |
688 | I used to wait around for this spinning wheel ... | 521391 | Positive | admin@slsops.gmail.com | 67936 |
173 rows × 5 columns
# NOTE(review): this cell reads the columns 'LSTM_label', 'distilbert_label'
# and 'text' from conflict_df; make sure the conflict_df in scope has them.
rating_label = list(conflict_df['rating_label'])
LSTM_label = list(conflict_df['LSTM_label'])
distilbert_label = list(conflict_df['distilbert_label'])
conflict_text = list(conflict_df['text'])
# One import task per conflicting sample, pre-annotated with the three candidate labels.
samples = [
    generate_import_json(text, [rating, lstm, distilbert])
    for text, rating, lstm, distilbert in zip(conflict_text, rating_label, LSTM_label, distilbert_label)
]
import requests
def upload_2_labelstudio(samples, project_num, token=None):
    """POST a JSON payload of tasks to a Label Studio project's import endpoint.

    Args:
        samples: Request body, already serialized (e.g. with ``json.dumps``).
        project_num: Project id interpolated into the import URL.
        token: API token. When omitted, the ``LABEL_STUDIO_TOKEN`` environment
            variable is used.

    Raises:
        RuntimeError: If no token is supplied and the environment variable is
            not set.

    Prints the response body returned by the server.
    """
    import os  # local import so the block does not depend on a file-level one

    # SECURITY: the API token used to be hard-coded here. A secret committed to
    # source must be considered leaked and should be revoked; it is now read
    # from the caller or the environment instead.
    token = token or os.environ.get("LABEL_STUDIO_TOKEN")
    if not token:
        raise RuntimeError(
            "Label Studio API token missing: pass token=... or set LABEL_STUDIO_TOKEN"
        )
    headers = {'Content-Type': 'application/json', 'Authorization': f'Token {token}'}
    url = f'https://label.slsops.athenka.com/api/projects/{project_num}/import'
    r = requests.post(url, headers=headers, data=samples)
    print(r.text)
import json
# Upload in batches of 500 tasks to project 274.
for i in range(0, len(conflict_df), 500):
    json_data = json.dumps(samples[i:i+500])
    upload_2_labelstudio(json_data, 274)
# Example of the import payload: one task with three candidate annotations.
sample_json = [
    {"data": {"text": "This is a test sentence"},
     "annotations": [
         {"result": [
             {"type": "choices",
              "value": {"choices": ["Negative"]},
              "to_name": "text",
              "from_name": "sentiment"
              }
         ]
         },
         {"result": [
             {"type": "choices",
              "value": {"choices": ["Neutral"]},
              "to_name": "text",
              "from_name": "sentiment"
              }
         ]
         },
         {"result": [
             {"type": "choices",
              "value": {"choices": ["Positive"]},
              "to_name": "text",
              "from_name": "sentiment"
              }
         ]
         }
     ]
     }
]